1 이항 회귀모형

1.1 데이터

library(tidyverse)
library(rvest)

# Scrape the worked logistic-regression example table (study hours vs.
# admission) from the Wikipedia article.
lr_data_url <- "https://en.wikipedia.org/wiki/Logistic_regression"

lr_data_raw <- lr_data_url %>% 
  read_html() %>% 
  html_elements(".wikitable") %>% 
  html_table() %>% 
  purrr::pluck(1)

# Reshape the scraped wide table into two tidy columns:
# 학습시간 (study hours) and 입학여부 (admitted 0/1).
lr_tbl <- lr_data_raw %>% 
  janitor::clean_names() %>% 
  pivot_longer(-x1) %>% 
  mutate(구분 = ifelse(str_detect(x1, "Hours"), "학습시간", "입학여부")) %>% 
  select(name, 구분, value) %>% 
  pivot_wider(names_from = 구분, values_from = value)  %>% 
  select(학습시간, 입학여부)

# Cache the cleaned table; create the target directory first so that
# write_rds() does not fail on a fresh checkout.
dir.create("data", showWarnings = FALSE)

lr_tbl %>% 
  write_rds("data/lr_tbl.rds")
lr_tbl <- read_rds("data/lr_tbl.rds")

lr_tbl %>% 
  reactable::reactable()

1.2 요약 통계량

# Summary statistics (n, missingness, mean, sd, quantiles, histogram)
# for both columns of the admission table.
lr_tbl %>% 
  skimr::skim()
Data summary
Name Piped data
Number of rows 20
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
학습시간 0 1 2.79 1.51 0.5 1.69 2.62 4.06 5.5 ▇▇▆▅▅
입학여부 0 1 0.50 0.51 0.0 0.00 0.50 1.00 1.0 ▇▁▁▁▇

1.3 시각화

# Scatter plot of admission outcome against study hours, with a fitted
# logistic regression curve overlaid (no confidence band).
lr_tbl %>% 
  ggplot(aes(x = 학습시간, y = 입학여부)) +
  geom_point() +
  geom_smooth(
    method      = "glm",
    method.args = list(family = "binomial"),
    se          = FALSE
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "학습시간과 입학확률",
    x     = "학습시간",
    y     = "합격확률 (%)"
  ) +
  theme_light()

1.4 내장 모형 활용

# Fit the logistic regression: admission (0/1) modeled on study hours.
# Kept as a single unformatted call because glm() stores the expression
# and print() echoes it in the "Call:" header.
adm_lr <- glm(입학여부 ~ 학습시간, family = "binomial", data = lr_tbl)

# Print coefficients and deviance summary.
adm_lr

Call:  glm(formula = 입학여부 ~ 학습시간, family = "binomial", 
    data = lr_tbl)

Coefficients:
(Intercept)     학습시간  
     -4.078        1.505  

Degrees of Freedom: 19 Total (i.e. Null);  18 Residual
Null Deviance:      27.73 
Residual Deviance: 16.06    AIC: 20.06

1.5 합격 예측 시각화

library(crosstalk)

# Evaluation grid of study hours for the fitted model.
crosstalk_lr_raw <- tibble(학습시간 = seq(from = 1, to = 5, 0.1))

# Predicted admission probability on the grid, with observed outcomes
# attached for plotting.
crosstalk_lr_tbl <- crosstalk_lr_raw %>% 
  mutate(합격확률 = predict(adm_lr, newdata = crosstalk_lr_raw, type = "response" )) %>% 
  # Spell out the join key; the implicit-key message hides a real
  # fragility here -- the join matches on floating-point 학습시간 values.
  left_join(lr_tbl, by = "학습시간") %>% 
  mutate(입학여부 = factor(입학여부, levels = c(0, 1), labels = c("불합격", "합격")) )

crosstalk_lr_g <- crosstalk_lr_tbl %>% 
    ggplot(aes(x = 학습시간, y = 합격확률) ) +
      geom_point() +
      # Observed outcomes drawn at 0/1 on the probability scale
      # (factor level 1/2 shifted back to 0/1).
      geom_point(aes(x = 학습시간, y = as.numeric(입학여부) - 1, color = 입학여부 ) ) +
      geom_smooth(method = "glm", method.args = list(family = "binomial"),
      se = FALSE) +
      labs(title = "학습시간과 입학확률",
           x = "학습시간",
           y = "합격확률 (%)") +
      theme_light() +
      scale_y_continuous(labels = scales::percent)

# Interactive version of the static plot.
plotly::ggplotly(crosstalk_lr_g )

1.6 직접 구현

최우추정량(MLE)을 찾는 것은 음의 로그 우도(Negative Log-Likelihood) 값을 최소화하는 것과 동일하기 때문에, General-purpose Optimization 함수에 목적함수를 정의하고 초기화한 모수를 함께 넣어 반복적으로 근사시키는 방식으로 모수를 계산한다.

\[ NLL(y) = -{\log(p(y))} \]

\[ \min_{\theta} \sum_y {-\log(p(y;\theta))} \]

\[ \max_{\theta} \prod_y p(y;\theta) \]

# Logistic (sigmoid) function: maps any real number into (0, 1).
# Vectorized over x.
sigmoid_fn <- function(x) {
  exp_neg <- exp(-x)
  1 / (1 + exp_neg)
}

# Negative log-likelihood of a Bernoulli (logistic regression) model.
#
# par           : numeric parameter vector. With include_alpha = TRUE the
#                 first element is the intercept and the rest are slopes;
#                 otherwise every element is a slope.
# data          : data frame holding the predictor columns and the 0/1
#                 response column.
# y             : name (string) of the response column in `data`.
# include_alpha : fit an intercept term? Defaults to TRUE
#                 (spelled out -- `T` is reassignable).
#
# Returns the scalar -sum(y*log(p) + (1-y)*log(1-p)), suitable for a
# general-purpose minimizer such as optimx().
neg_log_likelihood <- function(par, data, y, include_alpha = TRUE) {

  # Split predictors from the response. drop = FALSE keeps a single
  # predictor as a one-column object (a base data.frame would otherwise
  # collapse it to a bare vector and break the matrix algebra below).
  x <- as.matrix(data[, names(data) != y, drop = FALSE])
  y_data <- data[[y]]

  # 1. Linear combination: (alpha +) beta_1 * x_1 + beta_2 * x_2 + ...
  if (include_alpha) {
    theta <- drop(x %*% par[-1]) + par[1]
  } else {
    theta <- drop(x %*% par)
  }

  # 2. Success probability via the logistic function.
  p <- stats::plogis(theta)

  # 3. Negative log-likelihood.
  -sum(y_data * log(p) + (1 - y_data) * log(1 - p))
}

library(optimx)

# Minimize the negative log-likelihood starting from (0, 0); the optimum
# should reproduce the glm() coefficients.
lr_opt <- optimx(
  par           = c(0, 0),
  fn            = neg_log_likelihood,
  data          = lr_tbl,
  y             = "입학여부",
  method        = "Nelder-Mead",
  include_alpha = TRUE,
  itnmax        = 100,
  control       = list(trace = TRUE, all.methods = FALSE)
)
fn is  fn 
Looking for method =  Nelder-Mead 
Function has  2  arguments
Analytic gradient not made available.
Analytic Hessian not made available.
Scale check -- log parameter ratio= -Inf   log bounds ratio= NA 
Method:  Nelder-Mead 
  Nelder-Mead direct search function minimizer
function value for initial parameters = 13.862944
  Scaled convergence tolerance is 2.06574e-07
Stepsize computed as 0.100000
BUILD              3 13.887933 13.096825
EXTENSION          5 13.862944 12.582216
EXTENSION          7 13.096825 12.067907
EXTENSION          9 12.582216 11.285614
LO-REDUCTION      11 12.067907 11.285614
LO-REDUCTION      13 11.874408 11.285614
EXTENSION         15 11.469089 10.348151
LO-REDUCTION      17 11.285614 10.348151
EXTENSION         19 10.370274 8.914547
LO-REDUCTION      21 10.348151 8.914547
EXTENSION         23 9.266657 8.226456
REFLECTION        25 8.914547 8.030679
LO-REDUCTION      27 8.259889 8.030679
LO-REDUCTION      29 8.226456 8.030679
LO-REDUCTION      31 8.114076 8.030679
HI-REDUCTION      33 8.053435 8.030679
HI-REDUCTION      35 8.052924 8.030679
LO-REDUCTION      37 8.038012 8.030679
HI-REDUCTION      39 8.033349 8.030679
LO-REDUCTION      41 8.031439 8.030144
HI-REDUCTION      43 8.030679 8.030087
HI-REDUCTION      45 8.030144 8.029950
HI-REDUCTION      47 8.030087 8.029938
HI-REDUCTION      49 8.029950 8.029917
HI-REDUCTION      51 8.029938 8.029881
LO-REDUCTION      53 8.029917 8.029881
HI-REDUCTION      55 8.029890 8.029881
LO-REDUCTION      57 8.029889 8.029880
HI-REDUCTION      59 8.029881 8.029880
HI-REDUCTION      61 8.029880 8.029879
HI-REDUCTION      63 8.029880 8.029879
REFLECTION        65 8.029879 8.029879
Exiting from Nelder Mead minimizer
    67 function evaluations used
Post processing for method  Nelder-Mead 
Successful convergence! 
Compute Hessian approximation at finish of  Nelder-Mead 
Compute gradient approximation at finish of  Nelder-Mead 
Save results from method  Nelder-Mead 
$par
[1] -4.076953  1.504453

$value
[1] 8.029879

$message
NULL

$convcode
[1] 0

$fevals
function 
      67 

$gevals
gradient 
      NA 

$nitns
[1] NA

$kkt1
[1] TRUE

$kkt2
[1] TRUE

$xtimes
user.self 
     0.08 

Assemble the answers
# Pull the estimated parameters (p1 = intercept, p2 = slope) out of the
# optimx result table.
opt_results <- lr_opt[, 1:5] %>% 
  as.data.frame() %>% 
  rownames_to_column("method")

opt_results %>% 
  filter(method == "Nelder-Mead") %>% 
  select(method, p1, p2)
       method        p1       p2
1 Nelder-Mead -4.076953 1.504453

glm() 함수로 구현한 것과 값이 동일한지 상호확인한다.

adm_lr$coefficients
(Intercept)    학습시간 
  -4.077713    1.504645 

1.7 예측값 생성

# Predicted probabilities from a fitted logistic model.
#
# x   : data frame (or tibble) of predictor columns only.
# par : coefficient vector; when it has one more element than x has
#       columns, the first element is treated as the intercept. A one-row
#       data frame (e.g. an optimx result row, as passed by the caller
#       below) is accepted as well.
#
# Returns a numeric vector of probabilities, one per row of x.
predict_fn <- function(x, par){

  # optimx hands coefficients over as a one-row data frame; flatten to a
  # plain numeric vector so the arithmetic below is well-defined.
  par <- as.numeric(unlist(par))
  xm <- as.matrix(x)

  if( ncol(xm) < length(par) ){
    # Extra leading element is the intercept.
    theta <- drop(xm %*% par[-1]) + par[1]
  } else {
    theta <- drop(xm %*% par)
  }

  prob <- stats::plogis(theta)

  return(prob)
}

# Side-by-side comparison: hand-rolled predictions vs. glm()'s.
custom_pred <- predict_fn(lr_tbl %>% select(학습시간), lr_opt[, 1:2])

lr_tbl %>% 
  mutate(
    자체제작_예측 = custom_pred,
    예측 = predict(adm_lr, newdata = lr_tbl %>% select(학습시간), type = "response")
  ) %>% 
  reactable::reactable()

2 신경망

library(nnet)
library(NeuralNetTools)

# English column names for the nnet formula interface and plot labels.
lr_nn_tbl <- lr_tbl %>% 
  rename(
    adm_yn        = 입학여부,
    learning_hour = 학습시간
  )

# Single-hidden-layer network with two hidden units.
adm_nn <- nnet(
  adm_yn ~ learning_hour,
  size    = 2,
  softmax = FALSE,
  data    = lr_nn_tbl
)
# weights:  7
initial  value 5.508862 
iter  10 value 2.774533
iter  20 value 2.721744
iter  30 value 2.601059
iter  40 value 2.486752
iter  50 value 2.254115
iter  60 value 2.226825
iter  70 value 2.226668
iter  80 value 2.226582
iter  90 value 2.226488
iter 100 value 2.226445
final  value 2.226445 
stopped after 100 iterations
# NOTE(review): sourcing code straight from a remote gist executes
# unreviewed third-party code at render time -- consider vendoring the
# script into the repo if this stays.
devtools::source_url('https://gist.githubusercontent.com/Peque/41a9e20d6687f2f3108d/raw/85e14f3a292e126f1454864427e3a189c2fe33f3/nnet_plot_update.r')

# Diagram of the fitted network's structure and weights.
plotnet(adm_nn)

 

데이터 과학자 이광춘 저작

kwangchun.lee.7@gmail.com